#!/usr/bin/env python
# -*- coding: utf-8 -*-
import collections
import logging
import multiprocessing
import os
import re
import traceback
from lxml import etree
from datetime import datetime

import requests


class Config:
    """Static configuration for the proxy-pool checker script."""

    class logger:
        # Logging setup consumed by init_logger() in __main__.
        name = 'proxy_checkxici'  # logger name (DEFAULT: default)
        path = '/data/proxy_pool/logs'  # directory for log files (DEFAULT: .)
        stream_level = logging.INFO  # console handler threshold
        file_level = logging.WARNING  # file handler threshold

    multiworker = 4  # number of concurrent checker processes
    file_path = '/data/proxy_pool/logs/proxy_group.conf'  # nginx upstream conf output
    proxy_count = 30  # NOTE(review): unused in this file -- confirm before removing
    reload = 'bash /data/proxy_pool/src/nginx.sh reload'  # shell command to reload nginx
    interval = 5  # NOTE(review): unused in this file -- confirm before removing


def init_logger(name, subname='default', workspace='.', multiproc=False, stream_level=None, file_level=None):
    """Build a logger with optional stream and file handlers.

    The logger itself is set to DEBUG; each handler filters on its own
    level. The file handler writes to ``<workspace>/<subname>-YYYYMMDD.log``.

    :param name: logger name (ignored when multiproc is True)
    :param subname: prefix of the daily log file name
    :param workspace: directory for log files, created if missing
    :param multiproc: use the shared multiprocessing logger instead of a named one
    :param stream_level: if truthy, attach a StreamHandler at this level
    :param file_level: if truthy, attach a FileHandler at this level
    :return: (logger, stream_handler_or_None, file_handler_or_None)
    """
    if not os.path.exists(workspace):
        os.makedirs(workspace)

    log = multiprocessing.get_logger() if multiproc else logging.getLogger(name)
    log.setLevel(logging.DEBUG)
    fmt = logging.Formatter(r'[%(levelname)s][%(asctime)s %(filename)s:%(lineno)d] %(message)s')

    sh = None
    if stream_level:
        sh = logging.StreamHandler()
        sh.setFormatter(fmt)
        sh.setLevel(stream_level)
        log.addHandler(sh)

    fh = None
    if file_level:
        logfile = '%s/%s-%s.log' % (workspace, subname, datetime.now().strftime('%Y%m%d'))
        fh = logging.FileHandler(logfile)
        fh.setFormatter(fmt)
        fh.setLevel(file_level)
        log.addHandler(fh)

    return log, sh, fh


def multijobs(target, argslist, workers=None):
    """Run target(*args) for every args in argslist across worker processes.

    At most ``workers`` processes (default: CPU count, capped by the number
    of jobs) run concurrently. Returns a list of ``(result, error)`` tuples
    in the same order as ``argslist``, where ``error`` is None on success or
    ``(exception, formatted_traceback)`` when the call raised.

    NOTE(review): results are keyed by child pid, so order restoration
    assumes pids are not recycled within one run -- practically safe for
    short-lived batches.
    """
    if not workers:
        workers = multiprocessing.cpu_count()
    workers = min(len(argslist), workers)
    msgq = multiprocessing.Queue()

    # Wrapper run in the child: execute one call and ship (pid, result,
    # error) back through the shared queue.
    def worker(target, args):
        try:
            pid = os.getpid()
            res = target(*args)
            msgq.put((pid, res, None))
        except Exception, e:
            msgq.put((pid, None, (e, traceback.format_exc())))

    # Queue up not-yet-started Process objects in submission order.
    waiting = collections.deque()
    for args in argslist:
        args_wapper = [target, args]
        proc = multiprocessing.Process(target=worker, args=args_wapper)
        waiting.append(proc)

    seq = []  # child pids in start order, used to restore input order
    running = set()
    results = {}  # pid -> (result, error)
    # Number of queue messages to drain this iteration. Starts at 1 so the
    # loop blocks on msgq.get() for the first finished child instead of
    # spinning; afterwards it equals the number of children just reaped.
    dataNum = 1
    while True:
        # Top up the running set from the waiting queue, never exceeding
        # the worker limit.
        for i in xrange(max(0, min(len(waiting), workers - len(running)))):
            proc = waiting.popleft()
            running.add(proc)
            proc.start()
            seq.append(proc.pid)

        if len(running) == 0:
            break

        # if len(running) > 0, wait for blocking msgq.get() instead of time.sleep()
        for i in xrange(dataNum):
            pid, res, err = msgq.get()
            results[pid] = (res, err)
        dataNum = 0

        # check proc is alive or not
        dead = []
        for proc in running:
            if not proc.is_alive():
                dead.append(proc)

        # remove dead proc from running; each reaped child owes one more
        # queue message on the next pass
        for proc in dead:
            running.remove(proc)
            proc.join()
            dataNum += 1

    # Drain any messages that arrived after the last blocking get().
    while not msgq.empty():
        pid, res, err = msgq.get()
        results[pid] = (res, err)
    msgq.close()

    # Re-order collected results to match the original argslist order.
    ret = []
    for pid in seq:
        ret.append(results[pid])
    return ret


def proxy_check(ip, log):
    """Probe one proxy ('host:port') against up to three test URLs.

    :param ip: proxy address string used for both http and https
    :param log: logger for progress/outcome messages
    :return: ip on the first successful check, None when all attempts fail
    """
    proxies = {'http': ip, 'https': ip}
    for i in xrange(3):
        try:
            if check(proxies, i):
                log.info('ok check->%d|ip->%s', i, ip)
                return ip
        # Narrowed from a bare except: KeyboardInterrupt/SystemExit must
        # still abort the worker process instead of being counted as a
        # failed check.
        except Exception:
            log.debug('fail check->%d|ip->%s', i, ip)
    log.info('fail ip->%s', ip)
    return None


def check(proxy, num):
    """Fetch one well-known robots.txt through ``proxy``.

    :param proxy: requests-style proxies dict
    :param num: index (0-2) selecting which test URL to hit
    :return: True on a successful 2xx fetch
    :raises requests.RequestException: on timeout, connection failure, or
        (fix) an HTTP error status -- a broken proxy often answers with its
        own error page, which the old code counted as success.
    """
    tencent = 'http://www.qq.com/robots.txt'
    baidu = 'http://baidu.com/robots.txt'
    ali = 'https://www.taobao.com/robots.txt'
    test_url = (ali, baidu, tencent)
    resp = requests.get(test_url[num], proxies=proxy, timeout=5)
    resp.raise_for_status()
    return True


def write_conf(proxy_list, file_path):
    """Write one nginx upstream ``server`` line per proxy to ``file_path``.

    :param proxy_list: iterable of 'host:port' strings
    :param file_path: destination path (made absolute before writing)
    :return: True on success, False on any write error (logged)
    """
    try:
        file_path = os.path.abspath(file_path)
        with open(file_path, 'w') as f:
            for proxy in proxy_list:
                f.write('server %s;\n' % (proxy,))
            f.flush()
        return True
    # Narrowed from a bare except so SystemExit/KeyboardInterrupt are not
    # swallowed; real I/O errors are still logged and reported as False.
    except Exception:
        log.error('write nginx conf error', exc_info=True)
        return False


def nginx_reload():
    """Run the configured nginx reload command and log the outcome."""
    if os.system(Config.reload) == 0:
        log.debug('nginx reload ok')
    else:
        # exc_info=True dropped: no exception is active here, so it only
        # appended a misleading 'NoneType: None' to the log record.
        log.error('nginx reload error')


if __name__ == '__main__':
    # Shared multiprocessing-aware logger: worker processes log through it too.
    log, _, _ = init_logger(name=Config.logger.name,
                            subname=Config.logger.name,
                            workspace=Config.logger.path,
                            stream_level=Config.logger.stream_level,
                            file_level=Config.logger.file_level,
                            multiproc=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    resp = requests.get('https://www.xicidaili.com/wn/', headers=headers)
    if resp.status_code != 200:
        log.error('get ip list not http200')
        exit(1)
    et = etree.HTML(resp.content)
    tabs = et.xpath('//*[@id="ip_list"]/tr')
    # First row is the table header, so a usable page has at least 2 rows.
    if len(tabs) <= 1:
        log.error('not find table or table is empty')
        exit(1)
    # Build ('ip:port', log) argument tuples for the checker pool.
    multiargs = []
    for tab in tabs[1:]:
        x = tab.xpath('.//td/text()')
        multiargs.append(("%s:%s" % (x[0], x[1]), log))
    log.debug(multiargs)
    # Renamed from fail_list: multijobs returns ALL (result, error) pairs,
    # not just failures.
    results = multijobs(proxy_check, multiargs, Config.multiworker)
    new_proxy_list = [i for i, err in results if i is not None]
    log.warning('ok num->%d|fail num->%d', len(new_proxy_list), len(results) - len(new_proxy_list))
    log.info('ok ips->%s', new_proxy_list)
    try:
        if write_conf(new_proxy_list, Config.file_path):
            nginx_reload()
    # Narrowed from a bare except so Ctrl-C still terminates the script.
    except Exception:
        log.error('error', exc_info=True)
